Load Packages

In [1]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Lars, Ridge, ElasticNet, LassoLars, LassoLarsCV, LinearRegression
import re
from umap import UMAP
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import gower
import pickle
from collections import Counter
import plotly.express as px
from xgboost import XGBRFRegressor
import shap
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

# import the real estate price analytics library
from lib.real_estate_analytics_library import *
In [2]:
# optional - suppress warnings
import warnings
warnings.filterwarnings('ignore')

Scrape Property Price Data

In [3]:
# the root page link is used to generate the links for all pages
root = 'https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A20%2C%22SiteId%22%3A0%2C%22RootPropertyTypes%22%3A%5B%5D%2C%22PropertyTypes%22%3A%5B%5D%2C%22RoomsFrom%22%3Anull%2C%22RoomsTo%22%3Anull%2C%22FloorSearchType%22%3A0%2C%22LivingSpaceFrom%22%3Anull%2C%22LivingSpaceTo%22%3Anull%2C%22PriceFrom%22%3Anull%2C%22PriceTo%22%3Anull%2C%22ComparisPointsMin%22%3A0%2C%22AdAgeMax%22%3A0%2C%22AdAgeInHoursMax%22%3Anull%2C%22Keyword%22%3A%22%22%2C%22WithImagesOnly%22%3Anull%2C%22WithPointsOnly%22%3Anull%2C%22Radius%22%3Anull%2C%22MinAvailableDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22MinChangeDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22LocationSearchString%22%3A%22Z%C3%BCrich%22%2C%22Sort%22%3A3%2C%22HasBalcony%22%3Afalse%2C%22HasTerrace%22%3Afalse%2C%22HasFireplace%22%3Afalse%2C%22HasDishwasher%22%3Afalse%2C%22HasWashingMachine%22%3Afalse%2C%22HasLift%22%3Afalse%2C%22HasParking%22%3Afalse%2C%22PetsAllowed%22%3Afalse%2C%22MinergieCertified%22%3Afalse%2C%22WheelchairAccessible%22%3Afalse%2C%22LowerLeftLatitude%22%3Anull%2C%22LowerLeftLongitude%22%3Anull%2C%22UpperRightLatitude%22%3Anull%2C%22UpperRightLongitude%22%3Anull%7D&page='
In [4]:
# Open provided link using the requests package
# get the properties in Zürich, using the Comparis link for this result
links_page = requests.get('https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A20%2C%22SiteId%22%3A0%2C%22RootPropertyTypes%22%3A%5B%5D%2C%22PropertyTypes%22%3A%5B%5D%2C%22RoomsFrom%22%3Anull%2C%22RoomsTo%22%3Anull%2C%22FloorSearchType%22%3A0%2C%22LivingSpaceFrom%22%3Anull%2C%22LivingSpaceTo%22%3Anull%2C%22PriceFrom%22%3Anull%2C%22PriceTo%22%3Anull%2C%22ComparisPointsMin%22%3A0%2C%22AdAgeMax%22%3A0%2C%22AdAgeInHoursMax%22%3Anull%2C%22Keyword%22%3A%22%22%2C%22WithImagesOnly%22%3Anull%2C%22WithPointsOnly%22%3Anull%2C%22Radius%22%3Anull%2C%22MinAvailableDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22MinChangeDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22LocationSearchString%22%3A%22Z%C3%BCrich%22%2C%22Sort%22%3A3%2C%22HasBalcony%22%3Afalse%2C%22HasTerrace%22%3Afalse%2C%22HasFireplace%22%3Afalse%2C%22HasDishwasher%22%3Afalse%2C%22HasWashingMachine%22%3Afalse%2C%22HasLift%22%3Afalse%2C%22HasParking%22%3Afalse%2C%22PetsAllowed%22%3Afalse%2C%22MinergieCertified%22%3Afalse%2C%22WheelchairAccessible%22%3Afalse%2C%22LowerLeftLatitude%22%3Anull%2C%22LowerLeftLongitude%22%3Anull%2C%22UpperRightLatitude%22%3Anull%2C%22UpperRightLongitude%22%3Anull%7D&page=0')
In [5]:
soup = BeautifulSoup(links_page.content, 'html.parser')
In [6]:
# get the page number links
links = list(l['href'] for l in soup.find_all("a",{"class":"css-1yj1f35 excbu0j4"}))
In [7]:
# get the number of pages available for the location in question
num_pages = int(links[-2][links[-2].find('page=') + 5:]) + 1
In [8]:
# generate the list of pages that contain properties for the location in question
property_links = [root + str(i) for i in range(0, num_pages, 1)]
In [9]:
# define the root that we will comine with the property ID, giving us the page for each property
root = 'https://en.comparis.ch/immobilien/marktplatz/details/show/'
In [10]:
# define the list for storing the specific page for each property
pages = []

for property_link in property_links:    
    page = requests.get(property_link)

    soup = BeautifulSoup(page.content, 'html.parser')

    raw_id_list = re.findall(r'"AdId":[-+]?[0-9]+,', str(soup))

    id_list = [raw_id[raw_id.find(':') + 1:raw_id.find(',')] for raw_id in raw_id_list]

    # comine the root with the property ID, giving us the page for each property
    pages.extend([root + i for i in id_list])
In [11]:
# get the attributes for each property from the Comparis website
properties = []

for p in pages:
    page = requests.get(p)
    soup = BeautifulSoup(page.content, 'html.parser')
    property_address = list(soup.find("h3",{"class":"text-green"}))
    property_attributes = list(soup.find("dl",{"class":"row xsmall-up-2 medium-up-3 large-up-4 attributes-grid"}).stripped_strings)
    properties.append([property_address, property_attributes])
In [12]:
# check the length of the property attributes list
len(properties)
Out[12]:
132
In [13]:
# define the list of attributes that will be gathered from the scraped data
property_type = []
property_price = []
living_space = []
rooms = []
floor = []
available_date = []
public_transport = []
motorway = []
shop = []
In [14]:
# flatten the property address list
property_address = [record[0][0] for record in properties]
In [15]:
# cycle through the scraped property data and separate it into attribute-based lists that will be used to 
# create a pandas DataFrame

for record in properties:
    
    try:
        property_type.append(record[1][record[1].index('Property type') + 1])
    except:
        property_type.append(None)
    
    try:
        property_price.append(float(record[1][record[1].index('Purchase price') + 1][4:].replace(',','')))
    except:
        property_price.append(None)

    try:
        living_space.append(float(record[1][record[1].index('Living space') + 1][:-3]))
    except:
        living_space.append(None)
    
    try:
        rooms.append(get_num_rooms(record[1][record[1].index('Rooms') + 1]))
    except:
        rooms.append(None)
    
    try:
        floor.append(record[1][record[1].index('Floor') + 1])
    except:
        floor.append(None)
    
    try:
        available_date.append(record[1][record[1].index('Available') + 1])
    except:
        available_date.append(None)
    
    try:
        public_transport.append(float(record[1][record[1].index('Public transport stop') + 1][:-2]))
    except:
        public_transport.append(None)
    
    try:
        motorway.append(float(record[1][record[1].index('Motorway') + 1][:-2]))
    except:
        motorway.append(None)
    
    try:
        shop.append(float(record[1][record[1].index('Shops') + 1][:-2]))
    except:
        shop.append(None)
In [16]:
# create a pandas DataFrame that contains the raw attributes that we have gathered
property_records = pd.DataFrame(list(zip(property_address, property_type, property_price, living_space, rooms, floor, available_date, public_transport, motorway, shop)), columns =['property_address', 'property_type', 'property_price', 'living_space', 'rooms', 'floor', 'available_date', 'public_transport', 'motorway', 'shop'])
In [17]:
# show DataFrame
property_records
Out[17]:
property_address property_type property_price living_space rooms floor available_date public_transport motorway shop
0 Zürichberg, 8044 Zürich Single-family house 2800000.0 150.0 NaN None Immediately NaN NaN NaN
1 Regensdorferstrasse 60, 8049 Zürich Apartment 3485000.0 192.0 5.5 3. floor 01/10/2022 250.0 2000.0 600.0
2 Regensdorferstrasse 60, 8049 Zürich Apartment 1760000.0 127.0 4.5 1. floor 01/10/2022 250.0 2000.0 600.0
3 Regensdorferstrasse 60, 8049 Zürich Apartment 1670000.0 121.0 3.5 1. floor 01/10/2022 250.0 2000.0 600.0
4 Regensdorferstrasse 60, 8049 Zürich Apartment 1570000.0 99.0 3.5 Ground floor 01/10/2022 250.0 2000.0 600.0
... ... ... ... ... ... ... ... ... ... ...
127 8005 Zürich Apartment 4850000.0 NaN 5.5 19. floor By arrangement NaN NaN NaN
128 Zanggerweg 9, 8006 Zürich Terraced/row house 3150000.0 170.0 4.5 None None NaN NaN NaN
129 8002 Zürich Single-family house 13500000.0 230.0 NaN None None NaN NaN NaN
130 Meisenrain 19, 8044 Gockhausen Single-family house 1850000.0 200.0 6.5 None None NaN NaN NaN
131 Leutschenbachstrasse 30, 8050 Zürich Attic apartment 2050000.0 136.0 3.5 19. floor None 300.0 3000.0 1000.0

132 rows × 10 columns

In [18]:
# save the scraped property records
property_records.to_csv('data/property_records_purchase.csv')

Process Data

In this section, we process the scraped web data. This involves encoding all features as the appropriate data type and performing imputation (i.e. encoding missing data points as the mean, median or mode of the existing data.

In [19]:
# load data
property_records = pd.read_csv('data/property_records_purchase.csv')
In [20]:
# display the ratio of missing values for the below features

print('property_price:', property_records.loc[property_records['property_price'].isna() == True].shape[0]/property_records.shape[0])
print('living_space:', property_records.loc[property_records['living_space'].isna() == True].shape[0]/property_records.shape[0])
print('rooms:', property_records.loc[property_records['rooms'].isna() == True].shape[0]/property_records.shape[0])
print('property_address:', property_records.loc[property_records['property_address'].isna() == True].shape[0]/property_records.shape[0])
print('floor:', property_records.loc[property_records['floor'].isna() == True].shape[0]/property_records.shape[0])
print('property_type:', property_records.loc[property_records['property_type'].isna() == True].shape[0]/property_records.shape[0])
print('shop:', property_records.loc[property_records['shop'].isna() == True].shape[0]/property_records.shape[0])
print('public_transport:', property_records.loc[property_records['public_transport'].isna() == True].shape[0]/property_records.shape[0])
print('motorway:', property_records.loc[property_records['motorway'].isna() == True].shape[0]/property_records.shape[0])
property_price: 0.24242424242424243
living_space: 0.30303030303030304
rooms: 0.24242424242424243
property_address: 0.0
floor: 0.5151515151515151
property_type: 0.0
shop: 0.6893939393939394
public_transport: 0.7045454545454546
motorway: 0.7424242424242424
In [21]:
# process the data for use in a price prediction model, pricing analytics
property_records = process_records(property_records)
In [22]:
property_records
Out[22]:
property_address property_type property_price living_space rooms floor available_date public_transport motorway shop ... Building land Commercial property Loft Maisonette Multi-family house Other Penthouse Semi-detached house Single-family house Terraced/row house
0 Zürichberg, 8044 Zürich Single-family house 2800000.0 150.00 4.25 1. floor Immediately 186.612903 1854.928571 425.90625 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 Regensdorferstrasse 60, 8049 Zürich Apartment 3485000.0 192.00 5.50 3. floor 01/10/2022 250.000000 2000.000000 600.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Regensdorferstrasse 60, 8049 Zürich Apartment 1760000.0 127.00 4.50 1. floor 01/10/2022 250.000000 2000.000000 600.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Regensdorferstrasse 60, 8049 Zürich Apartment 1670000.0 121.00 3.50 1. floor 01/10/2022 250.000000 2000.000000 600.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Regensdorferstrasse 60, 8049 Zürich Apartment 1570000.0 99.00 3.50 Ground floor 01/10/2022 250.000000 2000.000000 600.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 8005 Zürich Apartment 4850000.0 149.32 5.50 19. floor By arrangement 186.612903 1854.928571 425.90625 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
96 Zanggerweg 9, 8006 Zürich Terraced/row house 3150000.0 170.00 4.50 1. floor NaN 186.612903 1854.928571 425.90625 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
97 8002 Zürich Single-family house 13500000.0 230.00 4.25 1. floor NaN 186.612903 1854.928571 425.90625 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
98 Meisenrain 19, 8044 Gockhausen Single-family house 1850000.0 200.00 6.50 1. floor NaN 186.612903 1854.928571 425.90625 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
99 Leutschenbachstrasse 30, 8050 Zürich Attic apartment 2050000.0 136.00 3.50 19. floor NaN 300.000000 3000.000000 1000.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

100 rows × 54 columns

In [23]:
property_records[property_records['property_type'].isna()]
Out[23]:
property_address property_type property_price living_space rooms floor available_date public_transport motorway shop ... Building land Commercial property Loft Maisonette Multi-family house Other Penthouse Semi-detached house Single-family house Terraced/row house

0 rows × 54 columns

In [24]:
# save the processed property records
property_records.to_csv('data/processed_property_records_purchase.csv')
In [25]:
# save the possible values for each feature
with open('data/possible_postcodes_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_postcode'].unique()), handle)

with open('data/possible_floors_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['floor'].unique()), handle)

with open('data/possible_types_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_type'].unique()), handle)

Model Selection and Training

In this section we will select, train and save two models - one tree-based model, and one linear regression-based model. The tree-based model will be selected because it has a lower mean absolute error, while the linear regression-based model will be used to extrapolate the price of real estate that falls outside of the range of the training data (i.e. very high-value real estate), since tree-based models cannot predict values that are higher than the highest target value in the dataset on which they are trained.

Note: the linear model assumes that there is a linear relationship between price and other features such as living space and number of rooms for larger properties outside of the dataset.

The methodology used in this Jupyter notebook assumes stability in the price data for the records that were scraped - that is, we assume that the prices did not significantly change over the time period covered by the property listings.

In [26]:
# load data
property_records = pd.read_csv('data/processed_property_records_purchase.csv')
In [27]:
x = property_records[[col for col in property_records.columns if col not in ['property_price', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]]
y = property_records['property_price']
In [28]:
fig = px.scatter(property_records, x="rooms", y="property_price", color="property_type", title="Purchase Price vs Number of Rooms", hover_data=['property_postcode'])
fig.show()
In [29]:
fig = px.box(property_records, x="rooms", y="property_price", title="Purchase Price vs Number of Rooms", points=False)
fig.show()
In [30]:
fig = px.scatter(property_records, x="living_space", y="property_price", color="property_type", title="Purchase Price vs Living Space", hover_data=['property_postcode'])
fig.show()
In [31]:
fig = px.box(property_records, x="property_postcode", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [32]:
fig = px.box(property_records, x="floor", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [33]:
fig = px.box(property_records, x="property_type", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [34]:
# scale the float features
columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
scaler = StandardScaler().fit(x[columns])
scaled = scaler.transform(x[columns])
scaled = pd.DataFrame(scaled, columns=['scaled_' + column for column in columns])
x = pd.concat([x, scaled], axis=1)
In [35]:
# save the scaler model for later use
with open('data/scaler_purchase.pickle', 'wb') as handle:
    pickle.dump(scaler, handle)
In [36]:
x = x.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop'])
In [37]:
# calculate the correlation_matrix matrix of the features and the dependent variable
correlation_matrix = property_records[[col for col in property_records.columns if col not in ['Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]].corr().loc[['property_price']].drop(['property_price'], axis=1)
In [38]:
# visualize the correlation_matrix matrix
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(correlation_matrix, square=True, vmin=-1, vmax=1, ax=ax, linewidths=1, xticklabels=correlation_matrix.columns, cmap="Blues")
plt.yticks(rotation=0)
plt.show()
In [39]:
get_vifs(x)
Out[39]:
[('8000', inf), ('8001', inf), ('8002', inf), ('8003', inf), ('8005', inf), ('8006', inf), ('8008', inf), ('8032', inf), ('8038', inf), ('8041', inf), ('8044', inf), ('8046', inf), ('8048', inf), ('8049', inf), ('8050', inf), ('8051', inf), ('8052', inf), ('8053', inf), ('8057', inf), ('1. floor', inf), ('15. floor', inf), ('16. floor', inf), ('19. floor', inf), ('2. floor', inf), ('20. floor', inf), ('3. floor', inf), ('4. floor', inf), ('5. floor', inf), ('6. floor', inf), ('Basement', inf), ('Ground floor', inf), ('Apartment', inf), ('Attic apartment', inf), ('Building land', inf), ('Commercial property', inf), ('Loft', inf), ('Maisonette', inf), ('Multi-family house', inf), ('Other', inf), ('Penthouse', inf), ('Semi-detached house', inf), ('Single-family house', inf), ('Terraced/row house', inf), ('scaled_motorway', 3.1164162868091267), ('scaled_shop', 2.877682697378056), ('scaled_public_transport', 2.7016677069885167), ('scaled_living_space', 1.9114394154269956), ('scaled_rooms', 1.454644651463779)]

The above VIFs indicate, as expected, serious multicolinearty in the data. This is because of the the one hot encoding of the categorical data. In order to fix this problem, we can eliminate a column from each of the categorical feature sets. We will select the columns below, based on their frequency in the data. This should not result in any significant loss in the performance of the model, as the removed values will still be indicated in the data (because all of the remaining columns/features will be 0 if the removed value is present). For example, if we remove the 'Apartment' encoding, then any record for an apartment will have all other property_type encoding set to 0 (e.g features such as 'Single garage' will all be equal to 0).

In [40]:
sorted(Counter(property_records['property_postcode']).items(), key=lambda v: v[1], reverse=True)
Out[40]:
[(8052, 16), (8049, 12), (8044, 8), (8050, 8), (8046, 7), (8001, 7), (8002, 7), (8006, 5), (8041, 4), (8005, 4), (8038, 4), (8048, 3), (8032, 3), (8003, 3), (8057, 2), (8053, 2), (8008, 2), (8000, 2), (8051, 1)]
In [41]:
sorted(Counter(property_records['floor']).items(), key=lambda v: v[1], reverse=True)
Out[41]:
[('1. floor', 63), ('Ground floor', 15), ('3. floor', 6), ('16. floor', 3), ('2. floor', 3), ('Basement', 2), ('4. floor', 2), ('19. floor', 2), ('15. floor', 1), ('5. floor', 1), ('6. floor', 1), ('20. floor', 1)]
In [42]:
sorted(Counter(property_records['property_type']).items(), key=lambda v: v[1], reverse=True)
Out[42]:
[('Apartment', 45), ('Commercial property', 19), ('Single-family house', 10), ('Maisonette', 7), ('Attic apartment', 5), ('Multi-family house', 5), ('Other', 4), ('Semi-detached house', 1), ('Building land', 1), ('Loft', 1), ('Penthouse', 1), ('Terraced/row house', 1)]
In [43]:
# define the columns that are to be eliminated from the input features to the Linear Regression model. This is to 
# eliminated multicolinearity.
eliminated_columns = ['8052', '1. floor', 'Apartment']
In [44]:
# The below VIFs for the reduced data indicate no multicolinearity.
get_vifs(x.drop(columns=eliminated_columns))
Out[44]:
[('Commercial property', 3.401267339900829), ('8005', 3.1088976091630607), ('scaled_motorway', 3.0831479002586306), ('Basement', 2.9272164752229632), ('Maisonette', 2.912928687395057), ('8051', 2.87767247181627), ('scaled_shop', 2.784828560516953), ('scaled_public_transport', 2.7012278881815353), ('Multi-family house', 2.5658692493415423), ('Loft', 2.4111165763298814), ('8050', 2.361840159472342), ('8006', 2.109759716710904), ('8048', 2.0322547296389692), ('8008', 2.0089934666325986), ('3. floor', 2.001196138492436), ('8046', 1.9943886418566603), ('8038', 1.9802588209722294), ('8001', 1.9487418700219483), ('19. floor', 1.9200019373365782), ('scaled_living_space', 1.895071682432339), ('Single-family house', 1.8795570961545836), ('15. floor', 1.847641798533287), ('Penthouse', 1.7993717752684646), ('8002', 1.7853988806667738), ('Ground floor', 1.7785544116552872), ('Other', 1.7603668171749465), ('20. floor', 1.7476020597831439), ('Building land', 1.6789456305438486), ('8000', 1.6346470998140412), ('8049', 1.6173236079824707), ('16. floor', 1.5720396503727196), ('8057', 1.5607753116070242), ('8044', 1.5598245124793084), ('8003', 1.5189584507907836), ('Attic apartment', 1.4959849011864903), ('2. floor', 1.4937333783607172), ('8032', 1.4710266940468288), ('scaled_rooms', 1.4496349704925766), ('4. floor', 1.4484015880352004), ('Terraced/row house', 1.4297244019716087), ('5. floor', 1.368138880803493), ('6. floor', 1.2784131377322006), ('8041', 1.1342489920541787), ('Semi-detached house', 1.0963964921131517), ('8053', 1.053888144798789)]
In [45]:
# save the list of eliminated columns for later use
with open('data/eliminated_columns_purchase.pickle', 'wb') as handle:
    pickle.dump(eliminated_columns, handle)
In [46]:
# remove the outliers detected by Tukey's test - this reduced dataset will be used in the training of the linear 
# models
xe, ye = remove_outliers_tukeys_test(x.drop(columns=eliminated_columns), y)
In [47]:
# use the Gower distance to scale the data for inout into UMAP dimensionality-reduction, which takes into account
# the float inputs and their interaction with the one hot-encoded data
umap_results = UMAP(n_neighbors=20).fit_transform(gower.gower_matrix(pd.concat([y, x], axis=1)))
In [48]:
outlier_indices = get_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
normal_indices = [i for i in range(0, x.shape[0], 1) if i not in outlier_indices]
In [49]:
outliers = pd.DataFrame(zip([v[0] for v in umap_results[outlier_indices]], [v[1] for v in umap_results[outlier_indices]], ['Outlier' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [50]:
normal = pd.DataFrame(zip([v[0] for v in umap_results[normal_indices]], [v[1] for v in umap_results[normal_indices]], ['Normal' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [51]:
# save the UMAP results as a pandas DataFrame
umap_data = pd.concat([normal, outliers]).reset_index(drop=True)
In [52]:
# plot the UMAP results, showing the outliers vs normal data points, based on the isolation forest model
fig = px.scatter(umap_data, x="Dimension 1", y="Dimension 2", color="Status", title="UMAP Result", hover_data=[umap_data.index.values])
fig.show()
In [53]:
# remove the outliers detected by the isolation forest - this reduced dataset will be used in the training of the 
# tree-based models
xt, yt = remove_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
In [54]:
model_types = [['Lasso', Lasso()], ['Ridge', Ridge()], ['ElasticNet', ElasticNet()], ['LassoLars', LassoLars()], ['LassoLarsCV', LassoLarsCV()], ['Lars', Lars()], ['LinearRegression', LinearRegression()]]
In [55]:
model_results = train_model(xe, ye, model_types, 5)
In [56]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [57]:
top_models
Out[57]:
[['LassoLarsCV', LassoLarsCV(copy_X=True, cv=None, eps=2.220446049250313e-16, fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True, positive=False, precompute='auto', verbose=False), 1620347.2071018608, 942723.3652149268, 479495.67309758515], ['Ridge', Ridge(alpha=0.5618262674606866, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), 1486887.4348484424, 964664.8906528577, 584006.9618431067], ['ElasticNet', ElasticNet(alpha=0.10317406507730789, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 1409795.9763442005, 913030.4839520153, 591756.7638380262], ['LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 1635021.444877556, 1114606.4576179758, 653670.7805931895], ['LassoLars', LassoLars(alpha=0.13516966061166694, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, max_iter=500, normalize=True, positive=False, precompute='auto', verbose=False), 1890230.1880311077, 1237526.6197827929, 672530.414830761]]
In [58]:
# train the best model on the expanded dataset
linear_pricing_model = model_results[0][1].fit(xe, ye)
linear_pricing_model_mae = top_models[0][4]
In [59]:
linear_pricing_model
Out[59]:
Lasso(alpha=0.8046022357874578, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [60]:
# save the selected model
with open('models/linear_pricing_model_purchase.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model, handle)

# save the model's MAE
with open('models/linear_pricing_model_purchase_mae.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model_mae, handle)
In [61]:
# calculate feature importances based on the regression coefficients
regression_interpretation = pd.DataFrame(sorted(list(zip(xe.columns, linear_pricing_model.coef_)), key=lambda v: abs(v[1]), reverse=False), columns=['Feature', 'Weight'])
In [62]:
# plot the regression corefficient-based feature importances
fig = px.scatter(regression_interpretation, x="Weight", y="Feature")
fig.update_yaxes(type='category')
fig.show()
In [63]:
model_types = [['XGBRFRegressor', XGBRFRegressor()], ['AdaBoostRegressor', AdaBoostRegressor()], ['RandomForestRegressor', RandomForestRegressor()], ['ExtraTreesRegressor', ExtraTreesRegressor()], ['DecisionTreeRegressor', DecisionTreeRegressor()], ['GradientBoostingRegressor', GradientBoostingRegressor()]]
In [64]:
model_results = train_model(xt, yt, model_types, 3)
In [65]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [66]:
top_models
Out[66]:
[['ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=14, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 17246302.95560664, 5637670.58471385, 474751.83798300475], ['XGBRFRegressor', XGBRFRegressor(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints=None, learning_rate=1, max_delta_step=0, max_depth=18, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=44, n_jobs=0, num_parallel_tree=44, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8, tree_method=None, validate_parameters=False, verbosity=None), 17003118.39162759, 5291022.041425151, 540161.5833333334], ['DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=12, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 11402030.752307927, 3749802.292589877, 555029.6296296297], ['GradientBoostingRegressor', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=4, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=42, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 15048961.680324301, 4764502.508312247, 565966.277298337], ['RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=7, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=33, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 24775911.572335422, 8239464.437604542, 638096.0573706882]]
In [67]:
# train the best model on the expanded dataset
pricing_model = top_models[0][1].fit(xt, yt)
pricing_model_mae = top_models[0][4]
In [68]:
pricing_model
Out[68]:
ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=14, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=15, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)
In [69]:
# save the selected model
with open('models/pricing_model_purchase.pickle', 'wb') as handle:
    pickle.dump(pricing_model, handle)

# save the model's MAE
with open('models/pricing_model_purchase_mae.pickle', 'wb') as handle:
    pickle.dump(pricing_model_mae, handle)
In [70]:
# calculate and show the raw SHAP values for the model
# reference: https://christophm.github.io/interpretable-ml-book/shap.html

# load JS visualization code to notebook
shap.initjs()

explainer = shap.TreeExplainer(pricing_model)
shap_values = explainer.shap_values(xt)

shap.summary_plot(shap_values, xt)
In [71]:
# show the SHAP value-based relative model feature importances
shap.summary_plot(shap_values, xt, plot_type="bar")

Predict the price of any given property

In [72]:
# show the possible values for each feature
with open('data/possible_postcodes_purchase.pickle', 'rb') as handle:
    print('Possible Postcodes =', pickle.load(handle))

print('')

with open('data/possible_floors_purchase.pickle', 'rb') as handle:
    print('Possible Floors =', pickle.load(handle))

print('')

with open('data/possible_types_purchase.pickle', 'rb') as handle:
    print('Possible Property Types =', pickle.load(handle))
Possible Postcodes = ['8044', '8049', '8006', '8057', '8052', '8046', '8041', '8050', '8053', '8005', '8001', '8008', '8002', '8048', '8000', '8038', '8032', '8003', '8051']

Possible Floors = ['1. floor', '3. floor', 'Ground floor', 'Basement', '15. floor', '4. floor', '5. floor', '16. floor', '6. floor', '2. floor', '20. floor', '19. floor']

Possible Property Types = ['Single-family house', 'Apartment', 'Commercial property', 'Semi-detached house', 'Other', 'Maisonette', 'Building land', 'Attic apartment', 'Multi-family house', 'Loft', 'Penthouse', 'Terraced/row house']
In [73]:
# load data
property_records = pd.read_csv('data/processed_property_records_purchase.csv')
In [74]:
# load the pre-trained models and other required data from pickle files
with open('models/pricing_model_purchase.pickle', 'rb') as handle:
    pricing_model = pickle.load(handle)

with open('models/pricing_model_purchase_mae.pickle', 'rb') as handle:
    pricing_model_mae = pickle.load(handle)
    
with open('models/linear_pricing_model_purchase.pickle', 'rb') as handle:
    linear_pricing_model = pickle.load(handle)

with open('models/linear_pricing_model_purchase_mae.pickle', 'rb') as handle:
    linear_pricing_model_mae = pickle.load(handle)

with open('data/eliminated_columns_purchase.pickle', 'rb') as handle:
    eliminated_columns = pickle.load(handle)

with open('data/scaler_purchase.pickle', 'rb') as handle:
    scaler = pickle.load(handle)
    
with open('data/encoder_purchase.pickle', 'rb') as handle:
    encoder = pickle.load(handle)
In [75]:
# define the feature values for the property
living_space = 140
rooms = 6
postcode = '8001'
floor = '1. floor'
property_type = 'Apartment'
public_transport = 100
motorway = 100
shop = 100
In [76]:
input_values = encode_input(living_space, rooms, postcode, floor, property_type, public_transport, motorway, shop, scaler, encoder)
In [77]:
input_values
Out[77]:
living_space rooms public_transport motorway shop property_postcode floor property_type 8000 8001 ... Other Penthouse Semi-detached house Single-family house Terraced/row house scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 140 6 100 100 100 8001 1. floor Apartment 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 -0.104644 0.335768 -1.425806 -2.783393 -1.93274

1 rows × 56 columns

In [78]:
# use one of: [regression_model, tree_model]
model_type = 'tree_model'
In [79]:
# calculate price
if model_type == 'regression_model':
    price = linear_pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type'] + eliminated_columns))[0]
    mae = linear_pricing_model_mae
else:
    price = pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type']))[0]
    mae = pricing_model_mae

calculated_price = pd.concat([pd.DataFrame([price], columns=['property_price']), input_values], axis=1)
In [80]:
calculated_price
Out[80]:
property_price living_space rooms public_transport motorway shop property_postcode floor property_type 8000 ... Other Penthouse Semi-detached house Single-family house Terraced/row house scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 2.244444e+07 140 6 100 100 100 8001 1. floor Apartment 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.104644 0.335768 -1.425806 -2.783393 -1.93274

1 rows × 57 columns

In [81]:
print('Predicted Price =', price, '+/-', mae, 'CHF')
print('Price Range =', price - mae, 'to', price + mae, 'CHF')
Predicted Price = 22444444.444444444 +/- 474751.83798300475 CHF
Price Range = 21969692.60646144 to 22919196.28242745 CHF
In [82]:
# the predicted price of the property is shown as a red cross, and is plotted alongside properties that are in 
# it's peer group (i.e. properties that have the same number of rooms and the same property type)
fig = px.scatter(property_records[(property_records['rooms'] == rooms) & (property_records['property_type'] == property_type)], x="living_space", y="property_price", color="property_type", hover_data=['living_space'])
fig1 = px.scatter(calculated_price, x="living_space", y="property_price", title="Calculated Price vs Peer Group", hover_data=['property_postcode'])
fig1.update_traces(marker=dict(size=10, color='Red', symbol='x'))
fig.add_trace(fig1.data[0])
fig.show()
In [ ]: